In [1]:
#Importing necessary Packages
import warnings
import itertools
import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pylab import rcParams
%matplotlib inline
In [2]:
#Loading the  dataset at zip code granular
##https://www.zillow.com/research/data/
df = pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv')
In [3]:
##display 5 records
df.head(5)
Out[3]:
RegionID SizeRank RegionName RegionType StateName State City Metro CountyName 2000-01-31 ... 2024-03-31 2024-04-30 2024-05-31 2024-06-30 2024-07-31 2024-08-31 2024-09-30 2024-10-31 2024-11-30 2024-12-31
0 91982 1 77494 zip TX TX Katy Houston-The Woodlands-Sugar Land, TX Fort Bend County 209183.857849 ... 490931.167974 493431.867652 494729.768844 495028.711253 494651.061479 495058.369196 495747.641095 496870.232365 497591.216589 498013.381149
1 61148 2 8701 zip NJ NJ Lakewood New York-Newark-Jersey City, NY-NJ-PA Ocean County 154922.985332 ... 689106.574503 699661.801780 708559.971483 715094.582736 720443.771066 726000.038135 733103.210025 740083.000281 745160.491341 747964.767711
2 91940 3 77449 zip TX TX Katy Houston-The Woodlands-Sugar Land, TX Harris County 103712.508410 ... 280754.700629 281717.848068 282255.119080 282218.313457 281986.353330 281807.105654 281620.894517 281206.739260 280422.308405 279697.082664
3 62080 4 11368 zip NY NY New York New York-Newark-Jersey City, NY-NJ-PA Queens County 302696.952652 ... 885946.861384 895038.513007 901396.636146 901967.282407 900606.279548 899885.353997 902459.710773 904617.265178 905818.780045 905896.339783
4 91733 5 77084 zip TX TX Houston Houston-The Woodlands-Sugar Land, TX Harris County 102666.820965 ... 274609.487817 275605.670852 276105.868652 275935.732220 275641.755834 275374.731211 275130.328740 274631.886651 274039.496837 273706.228700

5 rows × 309 columns

In [4]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)
Missing values in each column:
 RegionID      0
SizeRank      0
RegionName    0
RegionType    0
StateName     0
             ..
2024-08-31    0
2024-09-30    0
2024-10-31    0
2024-11-30    0
2024-12-31    0
Length: 309, dtype: int64
In [5]:
# Drop columns with all missing values
df_cleaned = df.dropna(axis=1, how='all')
In [6]:
# Fill missing values with the mean of the column
#dd_cleaned = df_cleaned.fillna(data_cleaned.mean())
In [7]:
# Display the cleaned dataframe
print(df_cleaned.head(5))
   RegionID  SizeRank  RegionName RegionType StateName State      City  \
0     91982         1       77494        zip        TX    TX      Katy   
1     61148         2        8701        zip        NJ    NJ  Lakewood   
2     91940         3       77449        zip        TX    TX      Katy   
3     62080         4       11368        zip        NY    NY  New York   
4     91733         5       77084        zip        TX    TX   Houston   

                                   Metro        CountyName     2000-01-31  \
0   Houston-The Woodlands-Sugar Land, TX  Fort Bend County  209183.857849   
1  New York-Newark-Jersey City, NY-NJ-PA      Ocean County  154922.985332   
2   Houston-The Woodlands-Sugar Land, TX     Harris County  103712.508410   
3  New York-Newark-Jersey City, NY-NJ-PA     Queens County  302696.952652   
4   Houston-The Woodlands-Sugar Land, TX     Harris County  102666.820965   

   ...     2024-03-31     2024-04-30     2024-05-31     2024-06-30  \
0  ...  490931.167974  493431.867652  494729.768844  495028.711253   
1  ...  689106.574503  699661.801780  708559.971483  715094.582736   
2  ...  280754.700629  281717.848068  282255.119080  282218.313457   
3  ...  885946.861384  895038.513007  901396.636146  901967.282407   
4  ...  274609.487817  275605.670852  276105.868652  275935.732220   

      2024-07-31     2024-08-31     2024-09-30     2024-10-31     2024-11-30  \
0  494651.061479  495058.369196  495747.641095  496870.232365  497591.216589   
1  720443.771066  726000.038135  733103.210025  740083.000281  745160.491341   
2  281986.353330  281807.105654  281620.894517  281206.739260  280422.308405   
3  900606.279548  899885.353997  902459.710773  904617.265178  905818.780045   
4  275641.755834  275374.731211  275130.328740  274631.886651  274039.496837   

      2024-12-31  
0  498013.381149  
1  747964.767711  
2  279697.082664  
3  905896.339783  
4  273706.228700  

[5 rows x 309 columns]
In [8]:
# Distribution of home values in a specific region over time
region_name = '68104'  
region_data = df_cleaned[df_cleaned['RegionName'] == int(region_name)].iloc[:, 10:].T
region_data.columns = ['HomeValue']
region_data.index = pd.to_datetime(region_data.index)

plt.figure(figsize=(12, 6))
plt.plot(region_data.index, region_data['HomeValue'], marker='o',color='blue')
plt.title(f'Home Values Over Time in Region {region_name}')
plt.xlabel('Year')
plt.ylabel('Home Value')
plt.grid(True)
plt.show()
In [9]:
print('''The above chart shows the property value trends over time for zip code 68104. 
### Summary of the Chart:
X-axis: Represents the time period from around 2000 to 2024.
Y-axis: Represents the property values.

### Key Observations:
Starting Value: Property values started at approximately $60,000 in 2000.
Trend: There is a general upward trend in property values over this period, with some fluctuations.
Significant Increases: Notable increases are observed starting around 2012, with a steep rise continuing through to 2024.
Ending Value: By 2024, property values have reached approximately $180,000.

### Interpretation of the Chart:
Substantial Growth: Property values in zip code 68104 have shown substantial growth over the past two decades.
Stable Period: From 2000 to around 2012, property values remained relatively stable.
Marked Increase: After 2012, there was a marked increase in property values, suggesting increasing demand or other economic factors positively influencing property prices in this area.
Economic Factors: The upward trend indicates that the area has become more desirable, possibly due to improvements in infrastructure, amenities, or overall economic growth.
 
''')
The above chart shows the property value trends over time for zip code 68104. 
### Summary of the Chart:
X-axis: Represents the time period from around 2000 to 2024.
Y-axis: Represents the property values.

### Key Observations:
Starting Value: Property values started at approximately $60,000 in 2000.
Trend: There is a general upward trend in property values over this period, with some fluctuations.
Significant Increases: Notable increases are observed starting around 2012, with a steep rise continuing through to 2024.
Ending Value: By 2024, property values have reached approximately $180,000.

### Interpretation of the Chart:
Substantial Growth: Property values in zip code 68104 have shown substantial growth over the past two decades.
Stable Period: From 2000 to around 2012, property values remained relatively stable.
Marked Increase: After 2012, there was a marked increase in property values, suggesting increasing demand or other economic factors positively influencing property prices in this area.
Economic Factors: The upward trend indicates that the area has become more desirable, possibly due to improvements in infrastructure, amenities, or overall economic growth.
 

In [10]:
#Dataframe containing only data for Metro= 'Omaha-Council Bluffs, NE-IA' and state NE
Douglas_County_df =  df_cleaned[(df_cleaned['Metro'] == 'Omaha-Council Bluffs, NE-IA') & (df['State'] =='NE')]
Douglas_County_df.head(5) ##74 rows
Out[10]:
RegionID SizeRank RegionName RegionType StateName State City Metro CountyName 2000-01-31 ... 2024-03-31 2024-04-30 2024-05-31 2024-06-30 2024-07-31 2024-08-31 2024-09-30 2024-10-31 2024-11-30 2024-12-31
2297 87855 2327 68104 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 61845.391049 ... 185160.710986 186644.242877 187791.405578 188147.951130 188347.104765 187877.484359 187293.870262 186750.679296 186918.838632 188001.320669
2978 87866 3016 68116 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 160146.962255 ... 360014.507509 361201.696645 362065.723010 362382.759677 362877.616611 363285.613712 363814.343178 364471.070866 365379.998067 366662.100117
3209 87811 3249 68022 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 232227.952627 ... 489064.713115 491103.233097 492741.813566 493406.193632 493826.757872 494095.134680 494514.837025 495216.566507 495832.424226 496803.212295
3315 87880 3356 68134 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 98567.912015 ... 238592.511319 239866.692413 240967.030857 241484.981002 241982.593922 241972.676199 241969.955464 241884.931339 242249.207926 243141.689693
3433 87830 3474 68046 zip NE NE Papillion Omaha-Council Bluffs, NE-IA Sarpy County 170796.025558 ... 382245.904584 382833.611747 383008.730987 382491.955618 382419.505359 382979.577767 384197.039017 385515.919123 386691.873029 387832.049927

5 rows × 309 columns

In [11]:
Douglas_County_df.to_csv("C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb.csv")
In [12]:
#Zip Codes located within the Douglas County region

fig, ax = plt.subplots(figsize=(15,12))

y = [a for a in Douglas_County_df['City'].value_counts()]

x = [a for a in Douglas_County_df['City'].value_counts().keys()]

ax.barh(x,y,color='brown')
ax.set_title('Cities in Douglas County',fontsize=30)
ax.set_ylabel('Cities',fontsize=20)
ax.set_xlabel('Number of Zipcodes in the City',fontsize=20);
In [13]:
print('''
The chart titled Cities in Douglas County displays the number of zip codes in various cities within Douglas County. Here's a breakdown of the chart:

X- axis: Represents the "Number of Zipcodes in the City," ranging from 0 to 30.
Y-axis: Lists the names of different cities in Douglas County.

### Key Observations:
Omaha Has the highest number of zip codes, close to 30, indicating it is the largest city in terms of zip code distribution.
Bellevue Follows with around 3 zip codes.
Other Cities Cities like Valley, Waterloo, and Bennington have fewer than three zip codes each.
Smaller Cities Most other cities listed have only one or two zip codes.

'''
     )
The chart titled Cities in Douglas County displays the number of zip codes in various cities within Douglas County. Here's a breakdown of the chart:

X- axis: Represents the "Number of Zipcodes in the City," ranging from 0 to 30.
Y-axis: Lists the names of different cities in Douglas County.

### Key Observations:
Omaha Has the highest number of zip codes, close to 30, indicating it is the largest city in terms of zip code distribution.
Bellevue Follows with around 3 zip codes.
Other Cities Cities like Valley, Waterloo, and Bennington have fewer than three zip codes each.
Smaller Cities Most other cities listed have only one or two zip codes.


In [14]:
#Filtering for Customers with budget of 300000 in Omaha-Council Bluffs, NE-IA Metro
Douglas_County_budget = Douglas_County_df[Douglas_County_df['2024-12-31']<=300000]
Douglas_County_budget.head(5)
Out[14]:
RegionID SizeRank RegionName RegionType StateName State City Metro CountyName 2000-01-31 ... 2024-03-31 2024-04-30 2024-05-31 2024-06-30 2024-07-31 2024-08-31 2024-09-30 2024-10-31 2024-11-30 2024-12-31
2297 87855 2327 68104 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 61845.391049 ... 185160.710986 186644.242877 187791.405578 188147.951130 188347.104765 187877.484359 187293.870262 186750.679296 186918.838632 188001.320669
3315 87880 3356 68134 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 98567.912015 ... 238592.511319 239866.692413 240967.030857 241484.981002 241982.593922 241972.676199 241969.955464 241884.931339 242249.207926 243141.689693
3637 87858 3680 68107 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County NaN ... 169186.248724 170364.472709 171447.742422 172266.922477 172682.737537 172473.963275 171863.443067 171391.584374 171530.134221 172501.351022
3756 87894 3801 68164 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 121613.700108 ... 273720.719422 275016.216030 275950.743521 276211.076942 276273.706253 276165.584832 276466.455383 276778.757067 277302.349579 278065.714655
4733 87883 4795 68137 zip NE NE Omaha Omaha-Council Bluffs, NE-IA Douglas County 115569.772369 ... 263510.808942 264846.039979 265950.025928 266242.996832 266468.031601 266588.847157 266886.168878 267178.123198 267585.412526 268627.929641

5 rows × 309 columns

In [15]:
#List of Zip Codes in theOmaha-Council Bluffs, NE-IA Metro within client's budget.
Douglas_County_budget_zips = [a for a in Douglas_County_budget['RegionName']]
In [16]:
#Fetching necessary data from the dataframe for the bar chart

#Getting City Names
city_names = []

for zipcode in Douglas_County_budget_zips:
    city_names.append(Douglas_County_budget[Douglas_County_budget['RegionName']==zipcode].iloc[0]['City'])

#Retrieving number of zipcodes for each city and feeding it to count_dict
count_dict = {}
for a in Counter(city_names).keys():
    count_dict[a] = Counter(city_names)[a]

#sorting dictionary into ascending order
count_dict = dict(sorted(count_dict.items(), key=lambda item: item[1]))

#Building the bar chart
fig,ax = plt.subplots(figsize=(21,12))

x_labels = [a for a in count_dict.keys()]
x = list(range(1,len(x_labels)+1))
y = [a for a in count_dict.values()]

ax.bar(x,y,color='rebeccapurple',width=0.5)

ax.set_xticks(x)
ax.set_xticklabels(x_labels,fontsize='12',rotation=90)
ax.set_title("Cities with zipcodes fitting our Client's budget",fontsize='30')
ax.set_ylabel("Number of Zipcodes",fontsize='20')
ax.set_xlabel("Cities in Douglas County",fontsize='20');
In [17]:
print('''The chart titled Cities with zipcodes fitting our Client's budget shows the number of zip codes in various cities within Douglas County that fit the client's budget. 

X-axis: Represents different cities in Douglas County.
Y-axis: Represents the number of zip codes fitting the client's budget, ranging from 0 to 20, with increments of 2.5.

### Key Observations:
Omaha: Has the highest number of zip codes fitting the client's budget, with approximately 18 zip codes.
Bellevue: Has a moderate number of zip codes, around 5.
Other Cities: Cities like La Vista, Ralston, Millard, Bennington, Waterloo, Elkhorn, Valley, Cedar Bluffs, Gretna, Springfield, Papillion, and Boys Town have fewer zip codes, mostly around 1 or less.

This bar chart effectively illustrates that Omaha has the most zip codes that fit the client's budget, significantly more than any other city in Douglas County. 
''')
The chart titled Cities with zipcodes fitting our Client's budget shows the number of zip codes in various cities within Douglas County that fit the client's budget. 

X-axis: Represents different cities in Douglas County.
Y-axis: Represents the number of zip codes fitting the client's budget, ranging from 0 to 20, with increments of 2.5.

### Key Observations:
Omaha: Has the highest number of zip codes fitting the client's budget, with approximately 18 zip codes.
Bellevue: Has a moderate number of zip codes, around 5.
Other Cities: Cities like La Vista, Ralston, Millard, Bennington, Waterloo, Elkhorn, Valley, Cedar Bluffs, Gretna, Springfield, Papillion, and Boys Town have fewer zip codes, mostly around 1 or less.

This bar chart effectively illustrates that Omaha has the most zip codes that fit the client's budget, significantly more than any other city in Douglas County. 

In [18]:
#Douglas_County_budget
melted_temp = pd.melt(Douglas_County_budget, id_vars=['RegionName', 'RegionID', 'SizeRank', 'City', 'State', 'Metro', 'CountyName'], var_name='time')
melted_temp.head(1000)

melted_zip = melted_temp[(melted_temp['Metro'] == 'Omaha-Council Bluffs, NE-IA') & (melted_temp['time'] != 'RegionType') & (melted_temp['time'] != 'StateName')]## & (melted_temp['RegionName'] == '68104')]
##melted_zip.to_csv("C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb.csv")
In [19]:
import pandas as pd
import matplotlib.pyplot as plt

#melted_zip = pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb.csv')
# Ensure 'melted_zip' is a copy to avoid SettingWithCopyWarning
melted_zip_time = melted_zip.copy()

# Convert 'time' column to datetime format
melted_zip_time.loc[:, 'time'] = pd.to_datetime(melted_zip_time['time'])

# Drop rows with missing values in 'value' column
melted_zip_time = melted_zip_time.dropna(subset=['value'])

# Convert 'value' column to numeric, forcing errors to NaN
melted_zip_time.loc[:, 'value'] = pd.to_numeric(melted_zip_time['value'], errors='coerce')

# Drop any remaining NaN values
df_cleaned_time = melted_zip_time.dropna(subset=['value'])
df_cleaned_time['year'] = df_cleaned_time['time'].dt.year

# Plot time series for each zip code
plt.figure(figsize=(12, 6))
for zip_code, data in df_cleaned_time.groupby('RegionName'):
    plt.plot(data['time'], data['value'], label=zip_code, alpha=0.5)

plt.xlabel("Time", fontsize=12)
plt.ylabel("Value", fontsize=12)
plt.title("Time Series Plot of Property Values for Each Zip Code", fontsize=14)
plt.xticks(rotation=45, fontsize=10)  # Rotate x-axis labels for better readability
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title="Zip Code", ncol=2)
plt.grid(True)
plt.tight_layout()  # Prevents label overlap
plt.show()
In [20]:
print('''
The time series chart titled Time Series Plot of Property Values for Each Zip Code shows the trends in property values over time for different zip codes. 

X-axis: Represents the time in years, ranging from 2000 to 2024.
Y-axis: Represents the property values.

### Key Observations:
Multiple Lines: Each line on the chart corresponds to a different zip code, showing how property values have changed over time for each zip code.
Trends: Most lines show an increase in property values over the period, particularly after 2012. This indicates a general upward trend in property values for the zip codes included in the chart.
Legend: The legend on the right side lists the zip codes, helping to identify which line corresponds to which zip code.

This chart effectively illustrates the historical trends in property values for various zip codes, highlighting periods of growth and stability. 

''')
The time series chart titled Time Series Plot of Property Values for Each Zip Code shows the trends in property values over time for different zip codes. 

X-axis: Represents the time in years, ranging from 2000 to 2024.
Y-axis: Represents the property values.

### Key Observations:
Multiple Lines: Each line on the chart corresponds to a different zip code, showing how property values have changed over time for each zip code.
Trends: Most lines show an increase in property values over the period, particularly after 2012. This indicates a general upward trend in property values for the zip codes included in the chart.
Legend: The legend on the right side lists the zip codes, helping to identify which line corresponds to which zip code.

This chart effectively illustrates the historical trends in property values for various zip codes, highlighting periods of growth and stability. 


In [21]:
##Distribution of Property Prices
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd


plt.figure(figsize=(10, 5))
sns.histplot(df_cleaned_time['value'], bins=50, kde=True)
plt.xlabel("Property Value")
plt.ylabel("Frequency")
plt.title("Distribution of Property Prices")
plt.grid(True)
plt.show()
In [22]:
print('''The chart "Distribution of Property Prices" visualizes the distribution of property values. 

### Summary of the Chart:
X-axis:: Represents property values, ranging from 0 to 300,000.
Y-axis: Represents the frequency of properties within each value range.
Histogram: Shows the distribution of property values with 50 bins.
KDE Line: A Kernel Density Estimate (KDE) line is overlaid to provide a smooth estimate of the probability density function for property values.

### Key Observations:
--> Most property values are concentrated between approximately 50,000 and 150,000.
-->There is a significant peak in frequency around property values between 75,000 and 100,000.

### Interpretation of the Chart:
-->Property Value Range: The majority of properties fall within the 50,000 to 150,000 range, suggesting that this is the most common price range for properties in the dataset.
-->High Demand Range: The peak around 75,000 to 100,000 indicates a high demand or a large number of properties valued within this range.

This chart effectively illustrates the distribution of property prices, highlighting the most common value ranges and providing a clear picture of the property market.
''')
The chart "Distribution of Property Prices" visualizes the distribution of property values. 

### Summary of the Chart:
X-axis:: Represents property values, ranging from 0 to 300,000.
Y-axis: Represents the frequency of properties within each value range.
Histogram: Shows the distribution of property values with 50 bins.
KDE Line: A Kernel Density Estimate (KDE) line is overlaid to provide a smooth estimate of the probability density function for property values.

### Key Observations:
--> Most property values are concentrated between approximately 50,000 and 150,000.
-->There is a significant peak in frequency around property values between 75,000 and 100,000.

### Interpretation of the Chart:
-->Property Value Range: The majority of properties fall within the 50,000 to 150,000 range, suggesting that this is the most common price range for properties in the dataset.
-->High Demand Range: The peak around 75,000 to 100,000 indicates a high demand or a large number of properties valued within this range.

This chart effectively illustrates the distribution of property prices, highlighting the most common value ranges and providing a clear picture of the property market.

In [23]:
df_cleaned_time.head(5)
Out[23]:
RegionName RegionID SizeRank City State Metro CountyName time value year
70 68104 87855 2327 Omaha NE Omaha-Council Bluffs, NE-IA Douglas County 2000-01-31 61845.391049 2000
71 68134 87880 3356 Omaha NE Omaha-Council Bluffs, NE-IA Douglas County 2000-01-31 98567.912015 2000
73 68164 87894 3801 Omaha NE Omaha-Council Bluffs, NE-IA Douglas County 2000-01-31 121613.700108 2000
74 68137 87883 4795 Omaha NE Omaha-Council Bluffs, NE-IA Douglas County 2000-01-31 115569.772369 2000
75 68144 87887 4992 Omaha NE Omaha-Council Bluffs, NE-IA Douglas County 2000-01-31 117433.426333 2000
In [24]:
##Boxplot of Property Values by Year
plt.figure(figsize=(12, 6))
sns.boxplot(x='year', y='value', data=df_cleaned_time)
plt.xticks(rotation=90)
plt.xlabel("Year")
plt.ylabel("Property Value")
plt.title("Property Value Distribution Over the Years")
plt.grid(True)
plt.show()
In [25]:
print('''The chart is a boxplot of property values by year from 2000 to 2024. 

### Summary of the Chart:
X-axis: Represents the years from 2000 to 2024.
Y-axis: Represents property values.
-->Boxplot: Each boxplot corresponds to a specific year and shows the distribution of property values for that year.
-->Boxes: Represent the interquartile range (IQR), with the line inside each box indicating the median value.
-->Whiskers: Extend from each box to show variability outside the upper and lower quartiles.
-->Outliers: Plotted as individual points beyond the whiskers, indicating property values significantly higher or lower than the majority.

### Interpretation of the Chart:
-->Increasing Property Values: Property values have generally increased over time from 2000 to 2024.
-->Variability: There is noticeable variability in property values each year, with some years showing wider IQRs than others.
-->Rising Median Values: Median property values have risen consistently over this period, indicating an overall upward trend in property prices.
-->Outliers: Present in most years, suggesting that there are some properties with significantly higher or lower values compared to the majority.

This boxplot effectively illustrates the distribution and trends of property values over time, highlighting the 
overall increase in property prices and the variability within each year.
'''
     )
The chart is a boxplot of property values by year from 2000 to 2024. 

### Summary of the Chart:
X-axis: Represents the years from 2000 to 2024.
Y-axis: Represents property values.
-->Boxplot: Each boxplot corresponds to a specific year and shows the distribution of property values for that year.
-->Boxes: Represent the interquartile range (IQR), with the line inside each box indicating the median value.
-->Whiskers: Extend from each box to show variability outside the upper and lower quartiles.
-->Outliers: Plotted as individual points beyond the whiskers, indicating property values significantly higher or lower than the majority.

### Interpretation of the Chart:
-->Increasing Property Values: Property values have generally increased over time from 2000 to 2024.
-->Variability: There is noticeable variability in property values each year, with some years showing wider IQRs than others.
-->Rising Median Values: Median property values have risen consistently over this period, indicating an overall upward trend in property prices.
-->Outliers: Present in most years, suggesting that there are some properties with significantly higher or lower values compared to the majority.

This boxplot effectively illustrates the distribution and trends of property values over time, highlighting the 
overall increase in property prices and the variability within each year.

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
import statsmodels.api as sm

# Load the data from the CSV file
data =  df #pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv')


# Display the first few rows of the dataframe
print(data.head(30))
    RegionID  SizeRank  RegionName RegionType StateName State           City  \
0      91982         1       77494        zip        TX    TX           Katy   
1      61148         2        8701        zip        NJ    NJ       Lakewood   
2      91940         3       77449        zip        TX    TX           Katy   
3      62080         4       11368        zip        NY    NY       New York   
4      91733         5       77084        zip        TX    TX        Houston   
5      93144         6       79936        zip        TX    TX        El Paso   
6      62093         7       11385        zip        NY    NY       New York   
7      92593         8       78660        zip        TX    TX   Pflugerville   
8      62019         9       11208        zip        NY    NY       New York   
9      95992        10       90011        zip        CA    CA    Los Angeles   
10     91926        11       77433        zip        TX    TX        Cypress   
11     84630        12       60629        zip        IL    IL        Chicago   
12     96193        13       90650        zip        CA    CA        Norwalk   
13     96361        14       91331        zip        CA    CA    Los Angeles   
14     74101        15       37013        zip        TN    TN      Nashville   
15     62046        16       11236        zip        NY    NY       New York   
16     61807        17       10467        zip        NY    NY       New York   
17     62085        18       11373        zip        NY    NY       New York   
18     62037        19       11226        zip        NY    NY       New York   
19     62018        20       11207        zip        NY    NY       New York   
20     70829        21       30044        zip        GA    GA  Lawrenceville   
21     96025        22       90044        zip        CA    CA    Los Angeles   
22     96817        23       92336        zip        CA    CA        Fontana   
23     97771        24       94565        zip        CA    CA      Pittsburg   
24     92036        25       77573        zip        TX    TX    League City   
25     96816        26       92335        zip        CA    CA        Fontana   
26     96083        27       90201        zip        CA    CA           Bell   
27     96107        28       90250        zip        CA    CA      Hawthorne   
28     61639        29       10025        zip        NY    NY       New York   
29     90654        30       75052        zip        TX    TX  Grand Prairie   

                                             Metro             CountyName  \
0             Houston-The Woodlands-Sugar Land, TX       Fort Bend County   
1            New York-Newark-Jersey City, NY-NJ-PA           Ocean County   
2             Houston-The Woodlands-Sugar Land, TX          Harris County   
3            New York-Newark-Jersey City, NY-NJ-PA          Queens County   
4             Houston-The Woodlands-Sugar Land, TX          Harris County   
5                                      El Paso, TX         El Paso County   
6            New York-Newark-Jersey City, NY-NJ-PA          Queens County   
7                 Austin-Round Rock-Georgetown, TX          Travis County   
8            New York-Newark-Jersey City, NY-NJ-PA           Kings County   
9               Los Angeles-Long Beach-Anaheim, CA     Los Angeles County   
10            Houston-The Woodlands-Sugar Land, TX          Harris County   
11              Chicago-Naperville-Elgin, IL-IN-WI            Cook County   
12              Los Angeles-Long Beach-Anaheim, CA     Los Angeles County   
13              Los Angeles-Long Beach-Anaheim, CA     Los Angeles County   
14  Nashville-Davidson--Murfreesboro--Franklin, TN        Davidson County   
15           New York-Newark-Jersey City, NY-NJ-PA           Kings County   
16           New York-Newark-Jersey City, NY-NJ-PA           Bronx County   
17           New York-Newark-Jersey City, NY-NJ-PA          Queens County   
18           New York-Newark-Jersey City, NY-NJ-PA           Kings County   
19           New York-Newark-Jersey City, NY-NJ-PA           Kings County   
20            Atlanta-Sandy Springs-Alpharetta, GA        Gwinnett County   
21              Los Angeles-Long Beach-Anaheim, CA     Los Angeles County   
22            Riverside-San Bernardino-Ontario, CA  San Bernardino County   
23              San Francisco-Oakland-Berkeley, CA    Contra Costa County   
24            Houston-The Woodlands-Sugar Land, TX       Galveston County   
25            Riverside-San Bernardino-Ontario, CA  San Bernardino County   
26              Los Angeles-Long Beach-Anaheim, CA     Los Angeles County   
27              Los Angeles-Long Beach-Anaheim, CA     Los Angeles County   
28           New York-Newark-Jersey City, NY-NJ-PA        New York County   
29                 Dallas-Fort Worth-Arlington, TX          Dallas County   

       2000-01-31  ...    2024-03-31    2024-04-30    2024-05-31  \
0   209183.857849  ...  4.909312e+05  4.934319e+05  4.947298e+05   
1   154922.985332  ...  6.891066e+05  6.996618e+05  7.085600e+05   
2   103712.508410  ...  2.807547e+05  2.817178e+05  2.822551e+05   
3   302696.952652  ...  8.859469e+05  8.950385e+05  9.013966e+05   
4   102666.820965  ...  2.746095e+05  2.756057e+05  2.761059e+05   
5    89296.318317  ...  2.087716e+05  2.101550e+05  2.112621e+05   
6   275372.206968  ...  8.165990e+05  8.218378e+05  8.249659e+05   
7   156630.893790  ...  4.038166e+05  4.036711e+05  4.032774e+05   
8   187881.614494  ...  6.795292e+05  6.835815e+05  6.864174e+05   
9   112751.607515  ...  5.682337e+05  5.665444e+05  5.674512e+05   
10  188219.104775  ...  4.303793e+05  4.320427e+05  4.327555e+05   
11   94129.380468  ...  2.357525e+05  2.389012e+05  2.415132e+05   
12  157854.835028  ...  7.201091e+05  7.218172e+05  7.254325e+05   
13  125683.830348  ...  7.063197e+05  7.056936e+05  7.072474e+05   
14  123561.218927  ...  3.669590e+05  3.681828e+05  3.693311e+05   
15  216374.206832  ...  6.705887e+05  6.733294e+05  6.745830e+05   
16  165851.804503  ...  6.091809e+05  6.158641e+05  6.201697e+05   
17  280787.914972  ...  9.187083e+05  9.252334e+05  9.309053e+05   
18  263959.505980  ...  1.106530e+06  1.116589e+06  1.126436e+06   
19  173128.169383  ...  6.416483e+05  6.440602e+05  6.449091e+05   
20  135548.780295  ...  3.618449e+05  3.641860e+05  3.660326e+05   
21  127861.032846  ...  6.156394e+05  6.142426e+05  6.153391e+05   
22  162735.847249  ...  6.842914e+05  6.883012e+05  6.927089e+05   
23  128210.120549  ...  5.834464e+05  5.872290e+05  5.907028e+05   
24            NaN  ...  3.795797e+05  3.805545e+05  3.813277e+05   
25  100734.402730  ...  5.405824e+05  5.420519e+05  5.453330e+05   
26  143328.878914  ...  6.540787e+05  6.541549e+05  6.553819e+05   
27  192046.275566  ...  8.390104e+05  8.374998e+05  8.397481e+05   
28            NaN  ...  1.375445e+06  1.366103e+06  1.357637e+06   
29  106423.904221  ...  3.303254e+05  3.307599e+05  3.308807e+05   

      2024-06-30    2024-07-31    2024-08-31    2024-09-30    2024-10-31  \
0   4.950287e+05  4.946511e+05  4.950584e+05  4.957476e+05  4.968702e+05   
1   7.150946e+05  7.204438e+05  7.260000e+05  7.331032e+05  7.400830e+05   
2   2.822183e+05  2.819864e+05  2.818071e+05  2.816209e+05  2.812067e+05   
3   9.019673e+05  9.006063e+05  8.998854e+05  9.024597e+05  9.046173e+05   
4   2.759357e+05  2.756418e+05  2.753747e+05  2.751303e+05  2.746319e+05   
5   2.117728e+05  2.119963e+05  2.121595e+05  2.124250e+05  2.126144e+05   
6   8.241853e+05  8.235255e+05  8.248570e+05  8.286172e+05  8.320040e+05   
7   4.022068e+05  4.008842e+05  3.997884e+05  3.992148e+05  3.982709e+05   
8   6.865275e+05  6.868643e+05  6.877204e+05  6.895644e+05  6.899895e+05   
9   5.683689e+05  5.709211e+05  5.749465e+05  5.808213e+05  5.862152e+05   
10  4.323321e+05  4.313540e+05  4.309533e+05  4.309909e+05  4.310984e+05   
11  2.439708e+05  2.455966e+05  2.473541e+05  2.485644e+05  2.496615e+05   
12  7.283206e+05  7.323845e+05  7.374218e+05  7.433161e+05  7.478709e+05   
13  7.096395e+05  7.141819e+05  7.209321e+05  7.282843e+05  7.344391e+05   
14  3.699239e+05  3.704216e+05  3.703718e+05  3.702954e+05  3.699462e+05   
15  6.736895e+05  6.732175e+05  6.744504e+05  6.778572e+05  6.812262e+05   
16  6.206369e+05  6.191099e+05  6.170219e+05  6.152134e+05  6.139342e+05   
17  9.315913e+05  9.313808e+05  9.311822e+05  9.320977e+05  9.347486e+05   
18  1.131265e+06  1.132676e+06  1.133803e+06  1.136250e+06  1.137471e+06   
19  6.440674e+05  6.433077e+05  6.434067e+05  6.445494e+05  6.454747e+05   
20  3.671151e+05  3.679698e+05  3.686291e+05  3.688152e+05  3.683133e+05   
21  6.166183e+05  6.195873e+05  6.240806e+05  6.301036e+05  6.351262e+05   
22  6.969221e+05  7.004450e+05  7.027734e+05  7.049159e+05  7.065207e+05   
23  5.918456e+05  5.915789e+05  5.906498e+05  5.890761e+05  5.868436e+05   
24  3.817400e+05  3.820621e+05  3.823446e+05  3.827447e+05  3.829185e+05   
25  5.483909e+05  5.513960e+05  5.534808e+05  5.559602e+05  5.578559e+05   
26  6.557622e+05  6.578933e+05  6.622184e+05  6.686316e+05  6.737822e+05   
27  8.419164e+05  8.458017e+05  8.503824e+05  8.566929e+05  8.622520e+05   
28  1.341787e+06  1.344609e+06  1.359405e+06  1.372631e+06  1.370689e+06   
29  3.306506e+05  3.304376e+05  3.302504e+05  3.301694e+05  3.298518e+05   

      2024-11-30    2024-12-31  
0   4.975912e+05  4.980134e+05  
1   7.451605e+05  7.479648e+05  
2   2.804223e+05  2.796971e+05  
3   9.058188e+05  9.058963e+05  
4   2.740395e+05  2.737062e+05  
5   2.127732e+05  2.129577e+05  
6   8.348912e+05  8.369265e+05  
7   3.970073e+05  3.950774e+05  
8   6.896089e+05  6.882254e+05  
9   5.909170e+05  5.945645e+05  
10  4.307925e+05  4.304035e+05  
11  2.507528e+05  2.518538e+05  
12  7.510449e+05  7.533441e+05  
13  7.395935e+05  7.448252e+05  
14  3.697126e+05  3.695855e+05  
15  6.846801e+05  6.865968e+05  
16  6.110946e+05  6.065055e+05  
17  9.388682e+05  9.445741e+05  
18  1.137110e+06  1.138285e+06  
19  6.457403e+05  6.442788e+05  
20  3.671993e+05  3.661526e+05  
21  6.391128e+05  6.418200e+05  
22  7.084834e+05  7.096680e+05  
23  5.851863e+05  5.845002e+05  
24  3.827001e+05  3.823055e+05  
25  5.602846e+05  5.622288e+05  
26  6.768174e+05  6.785491e+05  
27  8.682955e+05  8.738884e+05  
28  1.365597e+06  1.362692e+06  
29  3.292662e+05  3.284714e+05  

[30 rows x 309 columns]
In [27]:
# Bar Chart
#plt.figure(figsize=(10, 6))
#data['RegionName'].value_counts().plot(kind='bar')
#plt.title('Distribution of Region Names')
#plt.xlabel('Region Name')
#plt.ylabel('Count')
#plt.show()
In [28]:
# QQ Plot
plt.figure(figsize=(10, 6))
sm.qqplot(data['2024-01-31'], line ='45')
plt.title('QQ Plot of Home Values on 01/31/2024')
plt.show()
<Figure size 720x432 with 0 Axes>
In [29]:
print('''
Summary of the Chart:
The QQ plot (Quantile-Quantile plot) compares the distribution of home values on 01/31/2003 to a theoretical normal distribution. 
Points on the plot represent the quantiles of the data against the quantiles of the normal distribution.

Interpretation of the Chart:
If the data follows a normal distribution, the points will lie along the 45-degree reference line. 
Deviations from this line indicate departures from normality. 
In this case, the QQ plot helps us assess whether the home values on 01/31/2004 are normally distributed. 
Significant deviations from the line suggest that the data may not be normally distributed.
''')
Summary of the Chart:
The QQ plot (Quantile-Quantile plot) compares the distribution of home values on 01/31/2003 to a theoretical normal distribution. 
Points on the plot represent the quantiles of the data against the quantiles of the normal distribution.

Interpretation of the Chart:
If the data follows a normal distribution, the points will lie along the 45-degree reference line. 
Deviations from this line indicate departures from normality. 
In this case, the QQ plot helps us assess whether the home values on 01/31/2004 are normally distributed. 
Significant deviations from the line suggest that the data may not be normally distributed.

In [30]:
# Pair Plot (Scatter plot and Density Plots)
sns.pairplot(data[['2024-10-31', '2024-11-30', '2024-12-31']])
plt.show()
In [31]:
print('''
##Summary of the Chart:
-->The pair plot includes scatter plots for each pair of variables and density plots along the diagonal. 
]The variables are dates: `2024-10-31`, `2024-11-30`, and `2024-12-31`.

##Interpretation of the Chart:
1.Diagonal Elements:
   --> The diagonal elements are density plots showing the distribution of values for each date.
   --> For `2024-10-31`, `2024-11-30`, and `2024-12-31`, there is a clear peak indicating that most values are concentrated around a specific range.

2. Off-Diagonal Elements:
   --> The off-diagonal elements are scatter plots showing pairwise relationships between dates.
   --> There is a strong positive linear relationship between all pairs (`2024-10-31` vs. `2024-11-30`, `2024-10-31` vs. `2024-12-31`, and `2024-11-30` vs. `2024-12-31`). 
   This is indicated by points forming an upward-sloping line in each scatter plot.

There is a strong correlation between the data points on these three different dates, suggesting that as one variable increases, 
so do the others in a linear fashion. 
This can be useful for understanding trends and patterns in the data over time.
''')
##Summary of the Chart:
-->The pair plot includes scatter plots for each pair of variables and density plots along the diagonal. 
]The variables are dates: `2024-10-31`, `2024-11-30`, and `2024-12-31`.

##Interpretation of the Chart:
1.Diagonal Elements:
   --> The diagonal elements are density plots showing the distribution of values for each date.
   --> For `2024-10-31`, `2024-11-30`, and `2024-12-31`, there is a clear peak indicating that most values are concentrated around a specific range.

2. Off-Diagonal Elements:
   --> The off-diagonal elements are scatter plots showing pairwise relationships between dates.
   --> There is a strong positive linear relationship between all pairs (`2024-10-31` vs. `2024-11-30`, `2024-10-31` vs. `2024-12-31`, and `2024-11-30` vs. `2024-12-31`). 
   This is indicated by points forming an upward-sloping line in each scatter plot.

There is a strong correlation between the data points on these three different dates, suggesting that as one variable increases, 
so do the others in a linear fashion. 
This can be useful for understanding trends and patterns in the data over time.

In [32]:
# KMeans Clustering
kmeans = KMeans(n_clusters=3)
data = data.dropna(subset=['2024-10-31', '2024-11-30', '2024-12-31'])
data['Cluster'] = kmeans.fit_predict(data[['2024-10-31', '2024-11-30', '2024-12-31']])

# Scatter plot with clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x='2024-10-31' ,y ='2024-11-30', hue='Cluster', data=data, palette='viridis')
plt.title('KMeans Clustering')
plt.xlabel('Home Values on 2024-10-31')
plt.ylabel('Home Values on 2024-11-30')
plt.show()
In [33]:
print('''

--> The chart is a scatter plot showing the results of KMeans clustering on home values from two different dates: 2024-10-31 and 2024-11-30. 
The data points are color-coded based on the cluster they belong to, with three clusters in total (Cluster 0, Cluster 1, and Cluster 2). 
The x-axis represents home values on 2024-10-31, 
while the y-axis represents home values on 2024-11-30.

-->Cluster 0 (yellow): This cluster has a dense concentration along the diagonal line,
      indicating that for these homes, the values remained relatively stable between the two dates.
-->Cluster 1 (purple): This cluster also follows a similar pattern but is more spread out compared to Cluster 0.
-->Cluster 2 (green): This cluster shows a wider distribution with some outliers indicating significant changes 
      in home values between the two dates.

-->Trend Analysis:
   - The diagonal trend suggests that most homes did not experience drastic changes in value over this one-month period.
   - Outliers in each cluster indicate homes that had significant increases or decreases in value.

there is a strong correlation between the home values on these two dates, with most homes showing stable values. 
The clustering helps identify groups of homes with similar value trends, providing insights into regional trends 
and market segmentation.
''')

--> The chart is a scatter plot showing the results of KMeans clustering on home values from two different dates: 2024-10-31 and 2024-11-30. 
The data points are color-coded based on the cluster they belong to, with three clusters in total (Cluster 0, Cluster 1, and Cluster 2). 
The x-axis represents home values on 2024-10-31, 
while the y-axis represents home values on 2024-11-30.

-->Cluster 0 (yellow): This cluster has a dense concentration along the diagonal line,
      indicating that for these homes, the values remained relatively stable between the two dates.
-->Cluster 1 (purple): This cluster also follows a similar pattern but is more spread out compared to Cluster 0.
-->Cluster 2 (green): This cluster shows a wider distribution with some outliers indicating significant changes 
      in home values between the two dates.

-->Trend Analysis:
   - The diagonal trend suggests that most homes did not experience drastic changes in value over this one-month period.
   - Outliers in each cluster indicate homes that had significant increases or decreases in value.

there is a strong correlation between the home values on these two dates, with most homes showing stable values. 
The clustering helps identify groups of homes with similar value trends, providing insights into regional trends 
and market segmentation.

In [34]:
import os
import pandas as pd

#Function to read all CSV files from a folder and write to a dataframe
def read_csv_files_from_folder(folder_path, encoding='utf-8'):
    # List to store individual dataframes
    df_list = []
    
    # Iterate over all files in the folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a CSV file
        if file_name.endswith('.csv'):
            # Read the CSV file and append the dataframe to the list
            file_path = os.path.join(folder_path, file_name)
            try:
                df = pd.read_csv(file_path, encoding=encoding)
                df_list.append(df)
            except UnicodeDecodeError:
                print(f"Encoding error in file: {file_name}")
    
    # Concatenate all dataframes into a single dataframe
    combined_df = pd.concat(df_list, ignore_index=True)
    
    return combined_df

#Specify the folder path containing the CSV files
folder_path = 'C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Files used\\CAINC1'

#Read data from all CSV files in the folder and write to a dataframe
combined_df = read_csv_files_from_folder(folder_path, encoding='latin1')
combined_df.to_csv("C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\output_sb_income.csv")

combined_df['GeoFIPS'] = combined_df['GeoFIPS'].astype(str).str.replace('"', '')

#Display the first few rows of the combined dataframe
print(combined_df.head())
  GeoFIPS                            GeoName Region TableName  LineCode  \
0   02000                             Alaska      8    CAINC1       1.0   
1   02000                             Alaska      8    CAINC1       2.0   
2   02000                             Alaska      8    CAINC1       3.0   
3   02010  Aleutian Islands Census Area, AK*      8    CAINC1       1.0   
4   02010  Aleutian Islands Census Area, AK*      8    CAINC1       2.0   

  IndustryClassification                              Description  \
0                    ...  Personal income (thousands of dollars)    
1                    ...                  Population (persons) 1/   
2                    ...  Per capita personal income (dollars) 2/   
3                    ...  Personal income (thousands of dollars)    
4                    ...                  Population (persons) 1/   

                   Unit     1969     1970  ...      2014      2015      2016  \
0  Thousands of dollars  1575306  1798953  ...  40642115  41918639  41285498   
1     Number of persons   296000   304328  ...    737638    739127    743410   
2               Dollars     5322     5911  ...     55098     56714     55535   
3  Thousands of dollars     (NA)     (NA)  ...      (NA)      (NA)      (NA)   
4     Number of persons     (NA)     (NA)  ...      (NA)      (NA)      (NA)   

       2017      2018      2019      2020      2021      2022      2023  
0  41919375  43522749  44459670  45339636  48183676  50606751  52519998  
1    741949    737717    734823    732964    734923    733276    733406  
2     56499     58997     60504     61858     65563     69015     71611  
3      (NA)      (NA)      (NA)      (NA)      (NA)      (NA)      (NA)  
4      (NA)      (NA)      (NA)      (NA)      (NA)      (NA)      (NA)  

[5 rows x 63 columns]
In [35]:
combined_df_Personal_income = combined_df[combined_df['LineCode'] == 1.0]
combined_df_Per_capita_Personal_income = combined_df[combined_df['LineCode'] == 3.0]
combined_df_Population  = combined_df[combined_df['LineCode'] == 2.0]
In [36]:
combined_df_Personal_income.head()
Out[36]:
GeoFIPS GeoName Region TableName LineCode IndustryClassification Description Unit 1969 1970 ... 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
0 02000 Alaska 8 CAINC1 1.0 ... Personal income (thousands of dollars) Thousands of dollars 1575306 1798953 ... 40642115 41918639 41285498 41919375 43522749 44459670 45339636 48183676 50606751 52519998
3 02010 Aleutian Islands Census Area, AK* 8 CAINC1 1.0 ... Personal income (thousands of dollars) Thousands of dollars (NA) (NA) ... (NA) (NA) (NA) (NA) (NA) (NA) (NA) (NA) (NA) (NA)
6 02013 Aleutians East Borough, AK* 8 CAINC1 1.0 ... Personal income (thousands of dollars) Thousands of dollars (NA) (NA) ... 133578 165850 191326 200602 192045 202667 195030 202501 217797 226422
9 02016 Aleutians West Census Area, AK* 8 CAINC1 1.0 ... Personal income (thousands of dollars) Thousands of dollars (NA) (NA) ... 299213 297307 306233 294135 305957 315261 326510 329054 350185 372727
12 02020 Anchorage Municipality, AK 8 CAINC1 1.0 ... Personal income (thousands of dollars) Thousands of dollars 772088 875085 ... 18194339 18812630 18446135 18629102 19242734 19383968 19745555 20786653 21871379 22652344

5 rows × 63 columns

In [37]:
##pip install addfips
import pandas as pd
import addfips

#Initialize the AddFIPS object
af = addfips.AddFIPS()

#Function to get the FIPS code based on county and state
def get_fips(row):
    return af.get_county_fips(row['CountyName'], row['State'])

#Add the GeoFIPS column to the df_zip DataFrame
df['GeoFIPS'] = df.apply(get_fips, axis=1)

df.head(5)
Out[37]:
RegionID SizeRank RegionName RegionType StateName State City Metro CountyName 2000-01-31 ... 2024-04-30 2024-05-31 2024-06-30 2024-07-31 2024-08-31 2024-09-30 2024-10-31 2024-11-30 2024-12-31 GeoFIPS
0 91982 1 77494 zip TX TX Katy Houston-The Woodlands-Sugar Land, TX Fort Bend County 209183.857849 ... 493431.867652 494729.768844 495028.711253 494651.061479 495058.369196 495747.641095 496870.232365 497591.216589 498013.381149 48157
1 61148 2 8701 zip NJ NJ Lakewood New York-Newark-Jersey City, NY-NJ-PA Ocean County 154922.985332 ... 699661.801780 708559.971483 715094.582736 720443.771066 726000.038135 733103.210025 740083.000281 745160.491341 747964.767711 34029
2 91940 3 77449 zip TX TX Katy Houston-The Woodlands-Sugar Land, TX Harris County 103712.508410 ... 281717.848068 282255.119080 282218.313457 281986.353330 281807.105654 281620.894517 281206.739260 280422.308405 279697.082664 48201
3 62080 4 11368 zip NY NY New York New York-Newark-Jersey City, NY-NJ-PA Queens County 302696.952652 ... 895038.513007 901396.636146 901967.282407 900606.279548 899885.353997 902459.710773 904617.265178 905818.780045 905896.339783 36081
4 91733 5 77084 zip TX TX Houston Houston-The Woodlands-Sugar Land, TX Harris County 102666.820965 ... 275605.670852 276105.868652 275935.732220 275641.755834 275374.731211 275130.328740 274631.886651 274039.496837 273706.228700 48201

5 rows × 310 columns

In [38]:
combined_df_Personal_income.to_csv("C:\\Users\\req98376\\Downloads\\pi.csv")
In [39]:
df.to_csv("C:\\Users\\req98376\\Downloads\\df.csv")
In [40]:
#Join the two DataFrames on the GeoFIPS column
#combined_df = pd.merge(combined_df_Personal_income, df, on='GeoFIPS', how='inner')

#Display the first few rows of the combined DataFrame
#print(combined_df.head())
In [41]:
##pip install folium
In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import folium


# Load the data from the CSV files
zip_lat_long_data = pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Files used\\USZipsWithLatLon_20231227.csv')  # File with zip codes, latitude, and longitude
zip_data = df #pd.read_csv('C:\\Users\\req98376\\Downloads\\housing predictaions\\data\\Files used\\Zip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv')  # File with zip codes

# Merge the data based on zip codes
merged_data = pd.merge(zip_data, zip_lat_long_data, left_on='RegionName', right_on='postal code')
#merged_data.head(2)

# Draw the Map

sanfran_map=folium.Map(location=[41.2403, -95.7129], zoom_start=4)

incidents = folium.map.FeatureGroup()

for lat, lng, in zip(merged_data.latitude ,merged_data.longitude):
    incidents.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, 
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
        
        )
    )

# add incidents to map
sanfran_map.add_child(incidents)
Out[42]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [43]:
# Create a map centered around the USA
usa_map = folium.Map(location=[41.2403, -95.7129], zoom_start=4)
#merged_data = merged_data[merged_data['Metro'] =='Omaha-Council Bluffs, NE-IA']

grouped_data = merged_data.groupby('State').apply(lambda x: x.nlargest(2, '2024-12-31')).reset_index(drop=True)

# Add points to the map
for index, row in grouped_data.iterrows():
    folium.Marker([row['latitude'], row['longitude']], popup=row['RegionName']).add_to(usa_map)
## Save the map to an HTML file
usa_map.save('usa_map.html')


# Display the map
usa_map
Out[43]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [44]:
print('''
the above USA map shows top two cities for each state which has highest values as of date 2024-12-31
the cities are shown in the US MAP.''')
the above USA map shows top two cities for each state which has highest values as of date 2024-12-31
the cities are shown in the US MAP.